Data Science Pipeline

EDA


In [10]:
%matplotlib inline
import pandas as pd
iris_filename = './datasets-uci-iris.csv'
iris = pd.read_csv(iris_filename, header=None, names= ['sepal length', 'sepal width', 'petal length', 'petal width', 'target'])
iris.head()


Out[10]:
sepal length sepal width petal length petal width target
0 5.1 3.5 1.4 0.2 Iris-setosa
1 4.9 3.0 1.4 0.2 Iris-setosa
2 4.7 3.2 1.3 0.2 Iris-setosa
3 4.6 3.1 1.5 0.2 Iris-setosa
4 5.0 3.6 1.4 0.2 Iris-setosa

In [11]:
iris.describe()


Out[11]:
sepal length sepal width petal length petal width
count 150.000000 150.000000 150.000000 150.000000
mean 5.843333 3.054000 3.758667 1.198667
std 0.828066 0.433594 1.764420 0.763161
min 4.300000 2.000000 1.000000 0.100000
25% 5.100000 2.800000 1.600000 0.300000
50% 5.800000 3.000000 4.350000 1.300000
75% 6.400000 3.300000 5.100000 1.800000
max 7.900000 4.400000 6.900000 2.500000

In [12]:
iris.boxplot(return_type='axes')


Out[12]:
<matplotlib box plot of the four iris features>

In [13]:
iris.quantile([0.1, 0.9])


Out[13]:
sepal length sepal width petal length petal width
0.1 4.8 2.50 1.4 0.2
0.9 6.9 3.61 5.8 2.2

In [14]:
iris.target.unique()


Out[14]:
array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [15]:
pd.crosstab(iris['petal length'] > 3.758667, iris['petal width'] > 1.198667)


Out[15]:
petal width False True
petal length
False 56 1
True 4 89
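
The two thresholds used above are simply the means of petal length and petal width reported by describe() in Out[11]; an equivalent cell (a minimal variant) computes them on the fly instead of hard-coding the values:

In [ ]:
pd.crosstab(iris['petal length'] > iris['petal length'].mean(), iris['petal width'] > iris['petal width'].mean())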

In [16]:
import matplotlib.pyplot as plt

plt.scatter(iris['petal width'], iris['petal length'], alpha=1.0, color='k')
plt.xlabel("petal width")
plt.ylabel("petal length")


Out[16]:
<matplotlib.text.Text at 0x7f84d7e4ac90>

In [17]:
plt.hist(iris['petal width'], bins=20)
plt.xlabel("petal width distribution")


Out[17]:
<matplotlib.text.Text at 0x7f84d7e55590>

Feature Creation


In [18]:
import numpy as np
from sklearn import datasets
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
cali = datasets.california_housing.fetch_california_housing()
X = cali['data']
Y = cali['target']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.8)



In [19]:
from sklearn.neighbors import KNeighborsRegressor
regressor = KNeighborsRegressor()
regressor.fit(X_train, Y_train)
Y_est = regressor.predict(X_test)
print "MAE=", mean_squared_error(Y_test, Y_est)



In [ ]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
regressor = KNeighborsRegressor()
regressor.fit(X_train_scaled, Y_train)
Y_est = regressor.predict(X_test_scaled)
print "MAE=", mean_squared_error(Y_test, Y_est)

In [ ]:
non_linear_feat = 5 # AveOccup 
X_train_new_feat = np.sqrt(X_train[:,non_linear_feat])
X_train_new_feat.shape = (X_train_new_feat.shape[0], 1)
X_train_extended = np.hstack([X_train, X_train_new_feat])
X_test_new_feat = np.sqrt(X_test[:,non_linear_feat])
X_test_new_feat.shape = (X_test_new_feat.shape[0], 1)
X_test_extended = np.hstack([X_test, X_test_new_feat])
scaler = StandardScaler()
X_train_extended_scaled = scaler.fit_transform(X_train_extended)
X_test_extended_scaled = scaler.transform(X_test_extended)
regressor = KNeighborsRegressor()
regressor.fit(X_train_extended_scaled, Y_train)
Y_est = regressor.predict(X_test_extended_scaled)
print "MAE=", mean_squared_error(Y_test, Y_est)

Dimensionality Reduction

Covariance matrix


In [ ]:
from sklearn import datasets
import numpy as np
iris = datasets.load_iris()
cov_data = np.corrcoef(iris.data.T)
print iris.feature_names
print cov_data
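
Despite the variable name, np.corrcoef returns the Pearson correlation matrix, that is, the covariance normalized to the [-1, 1] range; for comparison, the unnormalized covariance matrix can be printed with np.cov (a quick check):

In [ ]:
# Unnormalized covariance matrix of the four iris features
print np.cov(iris.data.T)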

In [ ]:
import matplotlib.pyplot as plt
img = plt.matshow(cov_data, cmap=plt.cm.winter)
plt.colorbar(img, ticks=[-1, 0, 1])

In [ ]:
from sklearn.decomposition import PCA
pca_2c = PCA(n_components=2)
X_pca_2c = pca_2c.fit_transform(iris.data)
X_pca_2c.shape

In [ ]:
plt.scatter(X_pca_2c[:,0], X_pca_2c[:,1], c=iris.target, alpha=0.8, edgecolors='none')
plt.show()
pca_2c.explained_variance_ratio_.sum()

In [ ]:
pca_2c.components_

In [ ]:
pca_2cw = PCA(n_components=2, whiten=True)
X_pca_1cw = pca_2cw.fit_transform(iris.data)
plt.scatter(X_pca_1cw[:,0], X_pca_1cw[:,1], c=iris.target, alpha=0.8, edgecolors='none'); plt.show()
pca_2cw.explained_variance_ratio_.sum()

In [ ]:
pca_1c = PCA(n_components=1)
X_pca_1c = pca_1c.fit_transform(iris.data)
plt.scatter(X_pca_1c[:,0], np.zeros(X_pca_1c.shape), c=iris.target, alpha=0.8, edgecolors='none'); plt.show()
pca_1c.explained_variance_ratio_.sum()

In [ ]:
pca_95pc = PCA(n_components=0.95)
X_pca_95pc = pca_95pc.fit_transform(iris.data)
print pca_95pc.explained_variance_ratio_.sum()
print X_pca_95pc.shape

A variation of PCA for big data: randomized PCA


In [ ]:
from sklearn.decomposition import RandomizedPCA
rpca_2c = RandomizedPCA(n_components=2)
X_rpca_2c = rpca_2c.fit_transform(iris.data)
plt.scatter(X_rpca_2c[:,0], X_rpca_2c[:,1], c=iris.target, alpha=0.8, edgecolors='none'); plt.show()
rpca_2c.explained_variance_ratio_.sum()

Latent Factor Analysis


In [ ]:
from sklearn.decomposition import FactorAnalysis
fact_2c = FactorAnalysis(n_components=2)
X_factor = fact_2c.fit_transform(iris.data)
plt.scatter(X_factor[:,0], X_factor[:,1], c=iris.target, alpha=0.8, edgecolors='none'); plt.show()

Linear Discriminant Analysis, LDA


In [ ]:
from sklearn.lda import LDA
lda_2c = LDA(n_components=2)
X_lda_2c = lda_2c.fit_transform(iris.data, iris.target)
plt.scatter(X_lda_2c[:,0], X_lda_2c[:,1], c=iris.target, alpha=0.8, edgecolors='none'); plt.show()

Latent Semantic Analysis, LSA


In [ ]:
from sklearn.datasets import fetch_20newsgroups
categories = ['sci.med', 'sci.space']
twenty_sci_news = fetch_20newsgroups(categories=categories)
from sklearn.feature_extraction.text import TfidfVectorizer
tf_vect = TfidfVectorizer()
word_freq = tf_vect.fit_transform(twenty_sci_news.data)
from sklearn.decomposition import TruncatedSVD
tsvd_2c = TruncatedSVD(n_components=50)
tsvd_2c.fit(word_freq)
np.array(tf_vect.get_feature_names())[tsvd_2c.components_[20].argsort()[-10:][::-1]]

Kernel PCA


In [ ]:
def circular_points(radius, N):
    return np.array([[np.cos(2*np.pi*t/N)*radius, np.sin(2*np.pi*t/N)*radius] for t in xrange(N)])
N_points = 50
fake_circular_data = np.vstack([circular_points(1.0, N_points), circular_points(5.0, N_points)])
fake_circular_data += np.random.rand(*fake_circular_data.shape)
fake_circular_target = np.array([0]*N_points + [1]*N_points)
plt.scatter(fake_circular_data[:,0], fake_circular_data[:,1], c=fake_circular_target, alpha=0.8, edgecolors='none'); plt.show()

In [ ]:
from sklearn.decomposition import KernelPCA
kpca_2c = KernelPCA(n_components=2, kernel='rbf')
X_kpca_2c = kpca_2c.fit_transform(fake_circular_data)
plt.scatter(X_kpca_2c[:,0], X_kpca_2c[:,1], c=fake_circular_target, alpha=0.8, edgecolors='none'); plt.show()

Restricted Boltzmann Machine, RBM


In [ ]:
from sklearn import preprocessing
from sklearn.neural_network import BernoulliRBM
n_components = 64 # Try with 64, 100, 144
olivetti_faces = datasets.fetch_olivetti_faces()
X = preprocessing.binarize(preprocessing.scale(olivetti_faces.data), 0.5)
rbm = BernoulliRBM(n_components=n_components, learning_rate=0.01, n_iter=100)
rbm.fit(X)
plt.figure(figsize=(4.2, 4))
for i, comp in enumerate(rbm.components_):
    plt.subplot(int(np.sqrt(n_components+1)), int(np.sqrt(n_components+1)), i + 1)
    plt.imshow(comp.reshape((64, 64)), cmap=plt.cm.gray_r, interpolation='nearest')
    plt.xticks(()); plt.yticks(())
    
plt.suptitle(str(n_components) + ' components extracted by RBM', fontsize=16)
plt.subplots_adjust(0.08, 0.02, 0.92, 0.85, 0.08, 0.23)

plt.show()

Detection and Treatment of Outliers

Univariate outlier detection


In [ ]:
from sklearn.datasets import load_boston
boston = load_boston()
continuous_variables = [n for n in range(boston.data.shape[1]) if n!=3] # exclude CHAS (index 3), a binary variable

In [ ]:
import numpy as np
from sklearn import preprocessing
normalized_data = preprocessing.StandardScaler().fit_transform(boston.data[:,continuous_variables])
outliers_rows, outliers_columns = np.where(np.abs(normalized_data)>3)

In [ ]:
print outliers_rows

In [ ]:
print zip(outliers_rows, outliers_columns)

EllipticEnvelope


In [ ]:
# Create an artificial distribution made of blobs
from sklearn.datasets import make_blobs
blobs = 1
blob = make_blobs(n_samples=100, n_features=2, centers=blobs, cluster_std=1.5, shuffle=True, random_state=5)
# Robust Covariance Estimate
from sklearn.covariance import EllipticEnvelope
robust_covariance_est = EllipticEnvelope(contamination=.1).fit(blob[0])
detection = robust_covariance_est.predict(blob[0])
outliers = np.where(detection==-1)
inliers = np.where(detection==1)
# Draw the distribution and the detected outliers
from matplotlib import pyplot as plt
# Just the distribution
plt.plot(blob[0][:,0],blob[0][:,1], 'x', markersize=10, color='blue', alpha=0.8)
plt.show()
# The distribution and the outliers
a = plt.plot(blob[0][inliers,0],blob[0][inliers,1], 'x', markersize=10, color='blue', alpha=0.8, label='inliers')
b = plt.plot(blob[0][outliers,0],blob[0][outliers,1], 'o', markersize=6,color='red', alpha=0.8, label='outliers')
plt.legend((a[0],b[0]),('inliers','outliers'),numpoints=1,loc='lower right')
plt.show()

In [ ]:
from sklearn.decomposition import PCA
# Normalized data relative to continuous variables
continuous_variables = [n for n in range(boston.data.shape[1]) if n!=3]
normalized_data = preprocessing.StandardScaler().fit_transform(boston.data[:,continuous_variables])
# Just for visualization purposes pick the first 2 PCA components
pca = PCA(n_components=2)
Zscore_components = pca.fit_transform(normalized_data)
vtot = 'PCA Variance explained ' + str(round(np.sum(pca.explained_variance_ratio_),3))
v1 = str(round(pca.explained_variance_ratio_[0],3))
v2 = str(round(pca.explained_variance_ratio_[1],3))

In [ ]:
# Robust Covariance Estimate
robust_covariance_est = EllipticEnvelope(store_precision=False, assume_centered = False, contamination=.05)
robust_covariance_est.fit(normalized_data)
detection = robust_covariance_est.predict(normalized_data)
outliers = np.where(detection==-1)
regular = np.where(detection==1)

In [ ]:
# Draw the distribution and the detected outliers
from matplotlib import pyplot as plt
a = plt.plot(Zscore_components[regular,0],Zscore_components[regular,1], 'x', markersize=2, color='blue', alpha=0.6, label='inliers')
b = plt.plot(Zscore_components[outliers,0],Zscore_components[outliers,1], 'o', markersize=6,color='red', alpha=0.8, label='outliers')
plt.xlabel('1st component ('+v1+')')
plt.ylabel('2nd component ('+v2+')')
plt.xlim([-7,7])
plt.ylim([-6,6])
plt.legend((a[0],b[0]),('inliers','outliers'),numpoints=1,loc='best')
plt.title(vtot)
plt.show()

OneClassSVM


In [ ]:
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn import svm
# Normalized data relative to continuous variables
continuous_variables = [n for n in range(boston.data.shape[1]) if n!=3]
normalized_data = preprocessing.StandardScaler().fit_transform(boston.data[:,continuous_variables])
# Just for visualization purposes pick the first 5 PCA components
pca = PCA(n_components=5)
Zscore_components = pca.fit_transform(normalized_data)
vtot = 'PCA Variance explained ' + str(round(np.sum(pca.explained_variance_ratio_),3))
# OneClassSVM fitting and estimates
outliers_fraction = 0.02
nu_estimate = 0.95 * outliers_fraction + 0.05 # nu is an upper bound on the fraction of training errors: 0.95*0.02 + 0.05 = 0.069
machine_learning = svm.OneClassSVM(kernel="rbf", gamma=1.0/len(normalized_data), degree=3, nu=nu_estimate)
machine_learning.fit(normalized_data)
detection = machine_learning.predict(normalized_data)
outliers = np.where(detection==-1)
regular = np.where(detection==1)

In [ ]:
# Draw the distribution and the detected outliers
from matplotlib import pyplot as plt
for r in range(1,5):
  a = plt.plot(Zscore_components[regular,0],Zscore_components[regular,r], 'x', markersize=2, color='blue', alpha=0.6, label='inliers')
  b = plt.plot(Zscore_components[outliers,0],Zscore_components[outliers,r], 'o', markersize=6,color='red', alpha=0.8, label='outliers')
  plt.xlabel('Component 1 ('+str(round(pca.explained_variance_ratio_[0],3))+')')
  plt.ylabel('Component '+str(r+1)+' ('+str(round(pca.explained_variance_ratio_[r],3))+')')
  plt.xlim([-7,7])
  plt.ylim([-6,6])
  plt.legend((a[0],b[0]),('inliers','outliers'),numpoints=1,loc='best')
  plt.title(vtot)
  plt.show()

Scoring functions

Multiclass classification


In [ ]:
from sklearn import datasets
iris = datasets.load_iris()
# No cross-validation in this simple example
from sklearn.cross_validation import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(iris.data, iris.target, test_size=0.50, random_state=4)
# Use a very bad multiclass classifier
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(max_depth=2)
classifier.fit(X_train, Y_train) 
Y_pred = classifier.predict(X_test)
iris.target_names

In [ ]:
from sklearn import metrics
cm = metrics.confusion_matrix(Y_test, Y_pred)

In [ ]:
print cm

In [ ]:
print "Accuracy:", metrics.accuracy_score(Y_test, Y_pred)

In [ ]:
print "Precision:", metrics.precision_score(Y_test, Y_pred)

In [ ]:
print "Recall:", metrics.recall_score(Y_test, Y_pred)

In [ ]:
print "F1 score:", metrics.f1_score(Y_test, Y_pred)

In [ ]:
from sklearn.metrics import classification_report
print classification_report(Y_test, Y_pred, target_names=iris.target_names)

Regression


In [ ]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error([1.0, 0.0, 0.0], [0.0, 0.0, -1.0])

In [ ]:
from sklearn.metrics import mean_squared_error
mean_squared_error([-10.0, 0.0, 0.0], [0.0, 0.0, 0.0])

Testing and Validating


In [ ]:
from sklearn.datasets import load_digits
digits = load_digits()
print digits.DESCR
X = digits.data
y = digits.target

In [ ]:
X[0]

In [ ]:
from sklearn import svm
h1 = svm.LinearSVC(C=1.0) # linear SVC
h2 = svm.SVC(kernel='rbf', degree=3, gamma=0.001, C=1.0) # Radial basis SVC
h3 = svm.SVC(kernel='poly', degree=3, C=1.0) # 3rd degree polynomial SVC

In [ ]:
h1.fit(X,y)
print h1.score(X,y)

In [ ]:
from sklearn import cross_validation

In [ ]:
chosen_random_state = 1
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.30, random_state=chosen_random_state)
print "X train shape %s, X test shape %s, \ny train shape %s, y test shape %s" % (X_train.shape, X_test.shape, y_train.shape, y_test.shape)
h1.fit(X_train,y_train)
print h1.score(X_test,y_test) # Returns the mean accuracy on the given test data and labels

In [ ]:
chosen_random_state = 1
X_train, X_validation_test, y_train, y_validation_test = cross_validation.train_test_split(X, y, test_size=.40,
                                                                                           random_state=chosen_random_state)
X_validation, X_test, y_validation, y_test = cross_validation.train_test_split(X_validation_test, y_validation_test, 
                                                                                           test_size=.50, random_state=chosen_random_state)
print "X train shape, %s, X validation shape %s, X test shape %s, \ny train shape %s, y validation shape %s, y test shape %s\n" % (X_train.shape, X_validation.shape, X_test.shape, y_train.shape, y_validation.shape, y_test.shape)
for hypothesis in [h1, h2, h3]:
    hypothesis.fit(X_train,y_train)
    print "%s -> validation mean accuracy = %0.3f" % (hypothesis, hypothesis.score(X_validation,y_validation))    
h2.fit(X_train,y_train)
print "\n%s -> test mean accuracy = %0.3f" % (h2, h2.score(X_test,y_test))

Cross Validation


In [ ]:
chosen_random_state = 1
cv_folds = 10 # Try 3, 5 or 20
eval_scoring='accuracy' # Try also f1
workers = -1 # this will use all your CPU power
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.30, random_state=chosen_random_state)
for hypothesis in [h1, h2, h3]:
    scores = cross_validation.cross_val_score(hypothesis, X_train, y_train, cv=cv_folds, scoring= eval_scoring, n_jobs=workers)
    print "%s -> cross validation accuracy: mean = %0.3f std = %0.3f" % (hypothesis, np.mean(scores), np.std(scores))

In [ ]:
scores

Using cross-validation iterators


In [ ]:
kfolding = cross_validation.KFold(n=100, n_folds=10, shuffle=True, random_state=1)
for train_idx, validation_idx in kfolding:
    print train_idx, validation_idx

In [ ]:
h1.fit(X[train_idx],y[train_idx])
h1.score(X[validation_idx],y[validation_idx])
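
The cell above only scores the indexes left over from the last fold; to use every fold, the same iterator can be consumed in a loop (a minimal sketch, reusing kfolding, h1 and the digits data loaded earlier; since kfolding was built with n=100, only the first 100 samples are involved):

In [ ]:
fold_scores = list()
for train_idx, validation_idx in kfolding:
    # fit on the training indexes of the fold, score on its validation indexes
    h1.fit(X[train_idx], y[train_idx])
    fold_scores.append(h1.score(X[validation_idx], y[validation_idx]))
print "Mean accuracy across the %i folds: %0.3f" % (len(fold_scores), np.mean(fold_scores))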

Sampling and Bootstrapping


In [ ]:
subsampling = cross_validation.ShuffleSplit(n=100, n_iter=10, test_size=0.1, random_state=1)
for train_idx, validation_idx in subsampling:
    print train_idx, validation_idx

In [ ]:
import random
def Bootstrap(n, n_iter=3, random_state=None):
    """
    Random sampling with replacement cross-validation generator.
    For each iter a sample bootstrap of the indexes [0, n) is 
    generated and the function returns the obtained sample 
    and a list of all the excluded indexes.
    """
    if random_state:
        random.seed(random_state)
    for j in range(n_iter):
        bs = [random.randint(0, n-1) for i in range(n)]
        out_bs = list({i for i in range(n)} - set(bs))
        yield bs, out_bs

boot = Bootstrap(n=100, n_iter=10, random_state=1)
for train_idx, validation_idx in boot:
    print train_idx, validation_idx
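
The bootstrap indexes can be used exactly like the KFold ones; a minimal sketch that refits h1 on each bootstrap sample of the first 100 digits and scores it on the excluded (out-of-bag) observations:

In [ ]:
boot = Bootstrap(n=100, n_iter=10, random_state=1) # recreate the generator; the one above is exhausted
oob_scores = list()
for train_idx, validation_idx in boot:
    h1.fit(X[train_idx], y[train_idx])
    oob_scores.append(h1.score(X[validation_idx], y[validation_idx]))
print "Mean out-of-bag accuracy: %0.3f" % np.mean(oob_scores)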

Hyper-parameters Optimization


In [ ]:
from sklearn.datasets import load_digits
digits = load_digits()
X, y = digits.data, digits.target

In [ ]:
from sklearn import svm
h = svm.SVC()
hp = svm.SVC(probability=True, random_state=1)

In [ ]:
from sklearn import grid_search
search_grid = [
  {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
  {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
 ]
scorer = 'accuracy'

In [ ]:
search_func = grid_search.GridSearchCV(estimator=h, param_grid=search_grid, scoring=scorer, n_jobs=-1, iid=False, refit=True, cv=10)
%timeit search_func.fit(X,y)
print search_func.best_estimator_
print search_func.best_params_
print search_func.best_score_

Building custom scoring functions


In [ ]:
from sklearn.metrics import log_loss, make_scorer
Log_Loss = make_scorer(log_loss, greater_is_better=False, needs_proba=True)

In [ ]:
search_func = grid_search.GridSearchCV(estimator=hp, param_grid=search_grid, scoring=Log_Loss, n_jobs=-1, iid=False, refit=True, cv=3)
search_func.fit(X,y)
print search_func.best_score_
print search_func.best_params_

In [ ]:
import numpy as np
from sklearn.preprocessing import LabelBinarizer
def my_custom_log_loss_func(ground_truth, p_predictions, penalty = list(), eps=1e-15):
    adj_p = np.clip(p_predictions, eps, 1 - eps)
    lb = LabelBinarizer()
    g = lb.fit_transform(ground_truth)
    if g.shape[1] == 1:
        g = np.append(1 - g, g, axis=1)
    if penalty:
        g[:,penalty] = g[:,penalty] * 2
    summation = np.sum(g * np.log(adj_p))
    return summation * (-1.0/len(ground_truth))
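
With the default empty penalty list the function reduces to the plain multiclass log loss, so it can be sanity-checked against sklearn's log_loss on a toy example (the class labels and probabilities below are made up purely for illustration):

In [ ]:
toy_truth = [0, 1, 2]
toy_probs = np.array([[0.8, 0.1, 0.1],
                      [0.2, 0.7, 0.1],
                      [0.2, 0.2, 0.6]])
# both calls should return the same value when no penalty is applied
print my_custom_log_loss_func(toy_truth, toy_probs)
print log_loss(toy_truth, toy_probs)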

In [ ]:
my_custom_scorer = make_scorer(my_custom_log_loss_func, greater_is_better=False, needs_proba=True, penalty = [4,9])

In [ ]:
from sklearn import grid_search

search_grid = [{'C': [1, 10, 100, 1000], 'kernel': ['linear']},
               {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']}]
search_func = grid_search.GridSearchCV(estimator=hp, param_grid=search_grid, scoring=my_custom_scorer, n_jobs=1, iid=False, refit=True, cv=3)

search_func.fit(X,y)
print search_func.best_score_
print search_func.best_params_

Reducing grid search run time


In [ ]:
search_dict = {'kernel': ['linear','rbf'],'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001]}
scorer = 'accuracy'
search_func = grid_search.RandomizedSearchCV(estimator=h, param_distributions=search_dict, n_iter=7, scoring=scorer, 
                                          n_jobs=-1, iid=False, refit=True, cv=10)
%timeit search_func.fit(X,y)
print search_func.best_estimator_
print search_func.best_params_
print search_func.best_score_

In [ ]:
search_func.grid_scores_

Feature Selection


In [ ]:
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=800, n_features=100, n_informative=25, n_redundant=0, random_state=101)

Univariate selection


In [ ]:
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import chi2, f_classif
from sklearn.preprocessing import Binarizer, scale
Xbin = Binarizer().fit_transform(scale(X))
# if you use chi2, the input X must be non-negative: it must contain booleans or frequencies,
# hence the choice to binarize after standardization: 1 if the variable is above its average, 0 otherwise
Selector_chi2 = SelectPercentile(chi2, percentile=25).fit(Xbin, y)
Selector_f_classif = SelectPercentile(f_classif, percentile=25).fit(X, y)
chi_scores = Selector_chi2.get_support()
f_classif_scores = Selector_f_classif.get_support()
selected = chi_scores & f_classif_scores # use the bitwise and operator
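
The resulting boolean mask can then be applied column-wise to keep only the variables both tests agree on (a minimal sketch):

In [ ]:
print 'Features selected by both tests: %i out of %i' % (sum(selected), X.shape[1])
X_selected = X[:, selected] # keep only the columns flagged by both chi2 and f_classif
print X_selected.shape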

Recursive elimination


In [ ]:
from sklearn.cross_validation import train_test_split
X, y = make_classification(n_samples=100, n_features=100, n_informative=5, n_redundant=2, random_state=101)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=101)

In [ ]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=101)
classifier.fit(X_train, y_train)
print 'In-sample accuracy: %0.3f' % classifier.score(X_train, y_train)
print 'Out-of-sample accuracy: %0.3f' % classifier.score(X_test, y_test)

In [ ]:
from sklearn.feature_selection import RFECV
selector = RFECV(estimator=classifier, step=1, cv=10, scoring='accuracy')
selector.fit(X_train, y_train)
print("Optimal number of features : %d" % selector.n_features_)

In [ ]:
X_train_s = selector.transform(X_train)
X_test_s = selector.transform(X_test)
classifier.fit(X_train_s, y_train)
print 'Out-of-sample accuracy: %0.3f' % classifier.score(X_test_s, y_test)

Stability and L1 based selection


In [ ]:
from sklearn.svm import LinearSVC
classifier = LogisticRegression(C=0.1, penalty='l1', random_state=101) # the smaller C the fewer features selected
classifier.fit(X_train, y_train)
print 'Out-of-sample accuracy: %0.3f' % classifier.score(X_test, y_test)

In [ ]:
from sklearn.linear_model import RandomizedLogisticRegression
selector = RandomizedLogisticRegression(n_resampling=300, random_state=101)
selector.fit(X_train, y_train)
print 'Variables selected: %i' % sum(selector.get_support()!=0)
X_train_s = selector.transform(X_train)
X_test_s = selector.transform(X_test)
classifier.fit(X_train_s, y_train)
print 'Out-of-sample accuracy: %0.3f' % classifier.score(X_test_s, y_test)